{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "# Chatbot with memory\n",
    "\n",
    "A question-answering chatbot designed to handle complex bank statement data, utilizing LangChain, LanceDB, and PyMuPDF4LLM models."
   ],
   "metadata": {
    "id": "exWKR2roQk6W"
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Install necessary packages\n"
   ],
   "metadata": {
    "id": "JfO_nCHbQoTF"
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "36mSTi9Y8tB6",
    "outputId": "f8927aa8-c588-4cf9-87a9-cb520b4754bd"
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Collecting langchain\n",
      "  Downloading langchain-0.2.6-py3-none-any.whl (975 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m975.5/975.5 kB\u001b[0m \u001b[31m15.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting openai\n",
      "  Downloading openai-1.35.10-py3-none-any.whl (328 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m328.3/328.3 kB\u001b[0m \u001b[31m20.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: PyYAML>=5.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (6.0.1)\n",
      "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.0.31)\n",
      "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (3.9.5)\n",
      "Requirement already satisfied: async-timeout<5.0.0,>=4.0.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (4.0.3)\n",
      "Collecting langchain-core<0.3.0,>=0.2.10 (from langchain)\n",
      "  Downloading langchain_core-0.2.11-py3-none-any.whl (337 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m337.4/337.4 kB\u001b[0m \u001b[31m21.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)\n",
      "  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl (25 kB)\n",
      "Collecting langsmith<0.2.0,>=0.1.17 (from langchain)\n",
      "  Downloading langsmith-0.1.83-py3-none-any.whl (127 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m127.5/127.5 kB\u001b[0m \u001b[31m10.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: numpy<2,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain) (1.25.2)\n",
      "Requirement already satisfied: pydantic<3,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.8.0)\n",
      "Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.31.0)\n",
      "Requirement already satisfied: tenacity!=8.4.0,<9.0.0,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (8.4.2)\n",
      "Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.10/dist-packages (from openai) (3.7.1)\n",
      "Requirement already satisfied: distro<2,>=1.7.0 in /usr/lib/python3/dist-packages (from openai) (1.7.0)\n",
      "Collecting httpx<1,>=0.23.0 (from openai)\n",
      "  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.6/75.6 kB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from openai) (1.3.1)\n",
      "Requirement already satisfied: tqdm>4 in /usr/local/lib/python3.10/dist-packages (from openai) (4.66.4)\n",
      "Requirement already satisfied: typing-extensions<5,>=4.7 in /usr/local/lib/python3.10/dist-packages (from openai) (4.12.2)\n",
      "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.3.1)\n",
      "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (23.2.0)\n",
      "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.4.1)\n",
      "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.0.5)\n",
      "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.9.4)\n",
      "Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai) (3.7)\n",
      "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai) (1.2.1)\n",
      "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->openai) (2024.6.2)\n",
      "Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)\n",
      "  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.9/77.9 kB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)\n",
      "  Downloading h11-0.14.0-py3-none-any.whl (58 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m7.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting jsonpatch<2.0,>=1.33 (from langchain-core<0.3.0,>=0.2.10->langchain)\n",
      "  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)\n",
      "Requirement already satisfied: packaging<25,>=23.2 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.3.0,>=0.2.10->langchain) (24.1)\n",
      "Collecting orjson<4.0.0,>=3.9.14 (from langsmith<0.2.0,>=0.1.17->langchain)\n",
      "  Downloading orjson-3.10.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (141 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m141.1/141.1 kB\u001b[0m \u001b[31m13.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: annotated-types>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1->langchain) (0.7.0)\n",
      "Requirement already satisfied: pydantic-core==2.20.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1->langchain) (2.20.0)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (3.3.2)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (2.0.7)\n",
      "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy<3,>=1.4->langchain) (3.0.3)\n",
      "Collecting jsonpointer>=1.9 (from jsonpatch<2.0,>=1.33->langchain-core<0.3.0,>=0.2.10->langchain)\n",
      "  Downloading jsonpointer-3.0.0-py2.py3-none-any.whl (7.6 kB)\n",
      "Installing collected packages: orjson, jsonpointer, h11, jsonpatch, httpcore, langsmith, httpx, openai, langchain-core, langchain-text-splitters, langchain\n",
      "Successfully installed h11-0.14.0 httpcore-1.0.5 httpx-0.27.0 jsonpatch-1.33 jsonpointer-3.0.0 langchain-0.2.6 langchain-core-0.2.11 langchain-text-splitters-0.2.2 langsmith-0.1.83 openai-1.35.10 orjson-3.10.6\n",
      "Collecting pypdf\n",
      "  Downloading pypdf-4.2.0-py3-none-any.whl (290 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m290.4/290.4 kB\u001b[0m \u001b[31m5.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: typing_extensions>=4.0 in /usr/local/lib/python3.10/dist-packages (from pypdf) (4.12.2)\n",
      "Installing collected packages: pypdf\n",
      "Successfully installed pypdf-4.2.0\n",
      "Collecting langchain_community\n",
      "  Downloading langchain_community-0.2.6-py3-none-any.whl (2.2 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.2/2.2 MB\u001b[0m \u001b[31m28.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: PyYAML>=5.3 in /usr/local/lib/python3.10/dist-packages (from langchain_community) (6.0.1)\n",
      "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /usr/local/lib/python3.10/dist-packages (from langchain_community) (2.0.31)\n",
      "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.10/dist-packages (from langchain_community) (3.9.5)\n",
      "Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)\n",
      "  Downloading dataclasses_json-0.6.7-py3-none-any.whl (28 kB)\n",
      "Requirement already satisfied: langchain<0.3.0,>=0.2.6 in /usr/local/lib/python3.10/dist-packages (from langchain_community) (0.2.6)\n",
      "Requirement already satisfied: langchain-core<0.3.0,>=0.2.10 in /usr/local/lib/python3.10/dist-packages (from langchain_community) (0.2.11)\n",
      "Requirement already satisfied: langsmith<0.2.0,>=0.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain_community) (0.1.83)\n",
      "Requirement already satisfied: numpy<2,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain_community) (1.25.2)\n",
      "Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.10/dist-packages (from langchain_community) (2.31.0)\n",
      "Requirement already satisfied: tenacity!=8.4.0,<9.0.0,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain_community) (8.4.2)\n",
      "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (1.3.1)\n",
      "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (23.2.0)\n",
      "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (1.4.1)\n",
      "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (6.0.5)\n",
      "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (1.9.4)\n",
      "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (4.0.3)\n",
      "Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)\n",
      "  Downloading marshmallow-3.21.3-py3-none-any.whl (49 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.2/49.2 kB\u001b[0m \u001b[31m4.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)\n",
      "  Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)\n",
      "Requirement already satisfied: langchain-text-splitters<0.3.0,>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from langchain<0.3.0,>=0.2.6->langchain_community) (0.2.2)\n",
      "Requirement already satisfied: pydantic<3,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain<0.3.0,>=0.2.6->langchain_community) (2.8.0)\n",
      "Requirement already satisfied: jsonpatch<2.0,>=1.33 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.3.0,>=0.2.10->langchain_community) (1.33)\n",
      "Requirement already satisfied: packaging<25,>=23.2 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.3.0,>=0.2.10->langchain_community) (24.1)\n",
      "Requirement already satisfied: orjson<4.0.0,>=3.9.14 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.2.0,>=0.1.0->langchain_community) (3.10.6)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain_community) (3.3.2)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain_community) (3.7)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain_community) (2.0.7)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain_community) (2024.6.2)\n",
      "Requirement already satisfied: typing-extensions>=4.6.0 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy<3,>=1.4->langchain_community) (4.12.2)\n",
      "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy<3,>=1.4->langchain_community) (3.0.3)\n",
      "Requirement already satisfied: jsonpointer>=1.9 in /usr/local/lib/python3.10/dist-packages (from jsonpatch<2.0,>=1.33->langchain-core<0.3.0,>=0.2.10->langchain_community) (3.0.0)\n",
      "Requirement already satisfied: annotated-types>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1->langchain<0.3.0,>=0.2.6->langchain_community) (0.7.0)\n",
      "Requirement already satisfied: pydantic-core==2.20.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1->langchain<0.3.0,>=0.2.6->langchain_community) (2.20.0)\n",
      "Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain_community)\n",
      "  Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)\n",
      "Installing collected packages: mypy-extensions, marshmallow, typing-inspect, dataclasses-json, langchain_community\n",
      "Successfully installed dataclasses-json-0.6.7 langchain_community-0.2.6 marshmallow-3.21.3 mypy-extensions-1.0.0 typing-inspect-0.9.0\n",
      "Collecting pymupdf\n",
      "  Downloading PyMuPDF-1.24.7-cp310-none-manylinux2014_x86_64.whl (3.5 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.5/3.5 MB\u001b[0m \u001b[31m31.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting PyMuPDFb==1.24.6 (from pymupdf)\n",
      "  Downloading PyMuPDFb-1.24.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.7 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.7/15.7 MB\u001b[0m \u001b[31m62.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hInstalling collected packages: PyMuPDFb, pymupdf\n",
      "Successfully installed PyMuPDFb-1.24.6 pymupdf-1.24.7\n",
      "Collecting lancedb\n",
      "  Downloading lancedb-0.9.0-cp38-abi3-manylinux_2_28_x86_64.whl (20.9 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m20.9/20.9 MB\u001b[0m \u001b[31m55.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting deprecation (from lancedb)\n",
      "  Downloading deprecation-2.1.0-py2.py3-none-any.whl (11 kB)\n",
      "Collecting pylance==0.13.0 (from lancedb)\n",
      "  Downloading pylance-0.13.0-cp39-abi3-manylinux_2_28_x86_64.whl (25.5 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m25.5/25.5 MB\u001b[0m \u001b[31m52.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting ratelimiter~=1.0 (from lancedb)\n",
      "  Downloading ratelimiter-1.2.0.post0-py3-none-any.whl (6.6 kB)\n",
      "Requirement already satisfied: requests>=2.31.0 in /usr/local/lib/python3.10/dist-packages (from lancedb) (2.31.0)\n",
      "Collecting retry>=0.9.2 (from lancedb)\n",
      "  Downloading retry-0.9.2-py2.py3-none-any.whl (8.0 kB)\n",
      "Requirement already satisfied: tqdm>=4.27.0 in /usr/local/lib/python3.10/dist-packages (from lancedb) (4.66.4)\n",
      "Requirement already satisfied: pydantic>=1.10 in /usr/local/lib/python3.10/dist-packages (from lancedb) (2.8.0)\n",
      "Requirement already satisfied: attrs>=21.3.0 in /usr/local/lib/python3.10/dist-packages (from lancedb) (23.2.0)\n",
      "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from lancedb) (24.1)\n",
      "Requirement already satisfied: cachetools in /usr/local/lib/python3.10/dist-packages (from lancedb) (5.3.3)\n",
      "Collecting overrides>=0.7 (from lancedb)\n",
      "  Downloading overrides-7.7.0-py3-none-any.whl (17 kB)\n",
      "Requirement already satisfied: pyarrow<15.0.1,>=12 in /usr/local/lib/python3.10/dist-packages (from pylance==0.13.0->lancedb) (14.0.2)\n",
      "Requirement already satisfied: numpy>=1.22 in /usr/local/lib/python3.10/dist-packages (from pylance==0.13.0->lancedb) (1.25.2)\n",
      "Requirement already satisfied: annotated-types>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from pydantic>=1.10->lancedb) (0.7.0)\n",
      "Requirement already satisfied: pydantic-core==2.20.0 in /usr/local/lib/python3.10/dist-packages (from pydantic>=1.10->lancedb) (2.20.0)\n",
      "Requirement already satisfied: typing-extensions>=4.6.1 in /usr/local/lib/python3.10/dist-packages (from pydantic>=1.10->lancedb) (4.12.2)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.31.0->lancedb) (3.3.2)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.31.0->lancedb) (3.7)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.31.0->lancedb) (2.0.7)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.31.0->lancedb) (2024.6.2)\n",
      "Requirement already satisfied: decorator>=3.4.2 in /usr/local/lib/python3.10/dist-packages (from retry>=0.9.2->lancedb) (4.4.2)\n",
      "Collecting py<2.0.0,>=1.4.26 (from retry>=0.9.2->lancedb)\n",
      "  Downloading py-1.11.0-py2.py3-none-any.whl (98 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m98.7/98.7 kB\u001b[0m \u001b[31m11.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hInstalling collected packages: ratelimiter, py, overrides, deprecation, retry, pylance, lancedb\n",
      "Successfully installed deprecation-2.1.0 lancedb-0.9.0 overrides-7.7.0 py-1.11.0 pylance-0.13.0 ratelimiter-1.2.0.post0 retry-0.9.2\n",
      "Collecting pymupdf4llm\n",
      "  Downloading pymupdf4llm-0.0.8-py3-none-any.whl (20 kB)\n",
      "Requirement already satisfied: pymupdf>=1.24.2 in /usr/local/lib/python3.10/dist-packages (from pymupdf4llm) (1.24.7)\n",
      "Requirement already satisfied: PyMuPDFb==1.24.6 in /usr/local/lib/python3.10/dist-packages (from pymupdf>=1.24.2->pymupdf4llm) (1.24.6)\n",
      "Installing collected packages: pymupdf4llm\n",
      "Successfully installed pymupdf4llm-0.0.8\n",
      "Collecting unstructured[md]\n",
      "  Downloading unstructured-0.14.9-py3-none-any.whl (2.1 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.1/2.1 MB\u001b[0m \u001b[31m24.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: chardet in /usr/local/lib/python3.10/dist-packages (from unstructured[md]) (5.2.0)\n",
      "Collecting filetype (from unstructured[md])\n",
      "  Downloading filetype-1.2.0-py2.py3-none-any.whl (19 kB)\n",
      "Collecting python-magic (from unstructured[md])\n",
      "  Downloading python_magic-0.4.27-py2.py3-none-any.whl (13 kB)\n",
      "Requirement already satisfied: lxml in /usr/local/lib/python3.10/dist-packages (from unstructured[md]) (4.9.4)\n",
      "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from unstructured[md]) (3.8.1)\n",
      "Requirement already satisfied: tabulate in /usr/local/lib/python3.10/dist-packages (from unstructured[md]) (0.9.0)\n",
      "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from unstructured[md]) (2.31.0)\n",
      "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.10/dist-packages (from unstructured[md]) (4.12.3)\n",
      "Collecting emoji (from unstructured[md])\n",
      "  Downloading emoji-2.12.1-py3-none-any.whl (431 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m431.4/431.4 kB\u001b[0m \u001b[31m37.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: dataclasses-json in /usr/local/lib/python3.10/dist-packages (from unstructured[md]) (0.6.7)\n",
      "Collecting python-iso639 (from unstructured[md])\n",
      "  Downloading python_iso639-2024.4.27-py3-none-any.whl (274 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m274.7/274.7 kB\u001b[0m \u001b[31m28.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting langdetect (from unstructured[md])\n",
      "  Downloading langdetect-1.0.9.tar.gz (981 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m981.5/981.5 kB\u001b[0m \u001b[31m48.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
      "Requirement already satisfied: numpy<2 in /usr/local/lib/python3.10/dist-packages (from unstructured[md]) (1.25.2)\n",
      "Collecting rapidfuzz (from unstructured[md])\n",
      "  Downloading rapidfuzz-3.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.4/3.4 MB\u001b[0m \u001b[31m22.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting backoff (from unstructured[md])\n",
      "  Downloading backoff-2.2.1-py3-none-any.whl (15 kB)\n",
      "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from unstructured[md]) (4.12.2)\n",
      "Collecting unstructured-client (from unstructured[md])\n",
      "  Downloading unstructured_client-0.23.8-py3-none-any.whl (40 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.0/41.0 kB\u001b[0m \u001b[31m5.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: wrapt in /usr/local/lib/python3.10/dist-packages (from unstructured[md]) (1.14.1)\n",
      "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from unstructured[md]) (4.66.4)\n",
      "Requirement already satisfied: markdown in /usr/local/lib/python3.10/dist-packages (from unstructured[md]) (3.6)\n",
      "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4->unstructured[md]) (2.5)\n",
      "Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in /usr/local/lib/python3.10/dist-packages (from dataclasses-json->unstructured[md]) (3.21.3)\n",
      "Requirement already satisfied: typing-inspect<1,>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from dataclasses-json->unstructured[md]) (0.9.0)\n",
      "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from langdetect->unstructured[md]) (1.16.0)\n",
      "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->unstructured[md]) (8.1.7)\n",
      "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk->unstructured[md]) (1.4.2)\n",
      "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk->unstructured[md]) (2024.5.15)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->unstructured[md]) (3.3.2)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->unstructured[md]) (3.7)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->unstructured[md]) (2.0.7)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->unstructured[md]) (2024.6.2)\n",
      "Collecting deepdiff>=6.0 (from unstructured-client->unstructured[md])\n",
      "  Downloading deepdiff-7.0.1-py3-none-any.whl (80 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m80.8/80.8 kB\u001b[0m \u001b[31m10.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: httpx>=0.27.0 in /usr/local/lib/python3.10/dist-packages (from unstructured-client->unstructured[md]) (0.27.0)\n",
      "Collecting jsonpath-python>=1.0.6 (from unstructured-client->unstructured[md])\n",
      "  Downloading jsonpath_python-1.0.6-py3-none-any.whl (7.6 kB)\n",
      "Requirement already satisfied: mypy-extensions>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from unstructured-client->unstructured[md]) (1.0.0)\n",
      "Requirement already satisfied: nest-asyncio>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from unstructured-client->unstructured[md]) (1.6.0)\n",
      "Requirement already satisfied: packaging>=23.1 in /usr/local/lib/python3.10/dist-packages (from unstructured-client->unstructured[md]) (24.1)\n",
      "Requirement already satisfied: pypdf>=4.0 in /usr/local/lib/python3.10/dist-packages (from unstructured-client->unstructured[md]) (4.2.0)\n",
      "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from unstructured-client->unstructured[md]) (2.8.2)\n",
      "Collecting requests-toolbelt>=1.0.0 (from unstructured-client->unstructured[md])\n",
      "  Downloading requests_toolbelt-1.0.0-py2.py3-none-any.whl (54 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.5/54.5 kB\u001b[0m \u001b[31m6.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting ordered-set<4.2.0,>=4.1.0 (from deepdiff>=6.0->unstructured-client->unstructured[md])\n",
      "  Downloading ordered_set-4.1.0-py3-none-any.whl (7.6 kB)\n",
      "Requirement already satisfied: anyio in /usr/local/lib/python3.10/dist-packages (from httpx>=0.27.0->unstructured-client->unstructured[md]) (3.7.1)\n",
      "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.10/dist-packages (from httpx>=0.27.0->unstructured-client->unstructured[md]) (1.0.5)\n",
      "Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from httpx>=0.27.0->unstructured-client->unstructured[md]) (1.3.1)\n",
      "Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.10/dist-packages (from httpcore==1.*->httpx>=0.27.0->unstructured-client->unstructured[md]) (0.14.0)\n",
      "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio->httpx>=0.27.0->unstructured-client->unstructured[md]) (1.2.1)\n",
      "Building wheels for collected packages: langdetect\n",
      "  Building wheel for langdetect (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
      "  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993227 sha256=565bf7f90bb7797aa17417ba2726370799b5bdb06f243a18dd440c22c6463fce\n",
      "  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106\n",
      "Successfully built langdetect\n",
      "Installing collected packages: filetype, rapidfuzz, python-magic, python-iso639, ordered-set, langdetect, jsonpath-python, emoji, backoff, requests-toolbelt, deepdiff, unstructured-client, unstructured\n",
      "Successfully installed backoff-2.2.1 deepdiff-7.0.1 emoji-2.12.1 filetype-1.2.0 jsonpath-python-1.0.6 langdetect-1.0.9 ordered-set-4.1.0 python-iso639-2024.4.27 python-magic-0.4.27 rapidfuzz-3.9.4 requests-toolbelt-1.0.0 unstructured-0.14.9 unstructured-client-0.23.8\n",
      "Collecting langchain-openai\n",
      "  Downloading langchain_openai-0.1.14-py3-none-any.whl (45 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.9/45.9 kB\u001b[0m \u001b[31m1.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: langchain-community in /usr/local/lib/python3.10/dist-packages (0.2.6)\n",
      "Requirement already satisfied: langchain-core<0.3,>=0.2.2 in /usr/local/lib/python3.10/dist-packages (from langchain-openai) (0.2.11)\n",
      "Requirement already satisfied: openai<2.0.0,>=1.32.0 in /usr/local/lib/python3.10/dist-packages (from langchain-openai) (1.35.10)\n",
      "Collecting tiktoken<1,>=0.7 (from langchain-openai)\n",
      "  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m26.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: PyYAML>=5.3 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (6.0.1)\n",
      "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (2.0.31)\n",
      "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (3.9.5)\n",
      "Requirement already satisfied: dataclasses-json<0.7,>=0.5.7 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (0.6.7)\n",
      "Requirement already satisfied: langchain<0.3.0,>=0.2.6 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (0.2.6)\n",
      "Requirement already satisfied: langsmith<0.2.0,>=0.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (0.1.83)\n",
      "Requirement already satisfied: numpy<2,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (1.25.2)\n",
      "Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (2.31.0)\n",
      "Requirement already satisfied: tenacity!=8.4.0,<9.0.0,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (8.4.2)\n",
      "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (1.3.1)\n",
      "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (23.2.0)\n",
      "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (1.4.1)\n",
      "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (6.0.5)\n",
      "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (1.9.4)\n",
      "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (4.0.3)\n",
      "Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in /usr/local/lib/python3.10/dist-packages (from dataclasses-json<0.7,>=0.5.7->langchain-community) (3.21.3)\n",
      "Requirement already satisfied: typing-inspect<1,>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from dataclasses-json<0.7,>=0.5.7->langchain-community) (0.9.0)\n",
      "Requirement already satisfied: langchain-text-splitters<0.3.0,>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from langchain<0.3.0,>=0.2.6->langchain-community) (0.2.2)\n",
      "Requirement already satisfied: pydantic<3,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain<0.3.0,>=0.2.6->langchain-community) (2.8.0)\n",
      "Requirement already satisfied: jsonpatch<2.0,>=1.33 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.3,>=0.2.2->langchain-openai) (1.33)\n",
      "Requirement already satisfied: packaging<25,>=23.2 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.3,>=0.2.2->langchain-openai) (24.1)\n",
      "Requirement already satisfied: orjson<4.0.0,>=3.9.14 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.2.0,>=0.1.0->langchain-community) (3.10.6)\n",
      "Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.10/dist-packages (from openai<2.0.0,>=1.32.0->langchain-openai) (3.7.1)\n",
      "Requirement already satisfied: distro<2,>=1.7.0 in /usr/lib/python3/dist-packages (from openai<2.0.0,>=1.32.0->langchain-openai) (1.7.0)\n",
      "Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from openai<2.0.0,>=1.32.0->langchain-openai) (0.27.0)\n",
      "Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from openai<2.0.0,>=1.32.0->langchain-openai) (1.3.1)\n",
      "Requirement already satisfied: tqdm>4 in /usr/local/lib/python3.10/dist-packages (from openai<2.0.0,>=1.32.0->langchain-openai) (4.66.4)\n",
      "Requirement already satisfied: typing-extensions<5,>=4.7 in /usr/local/lib/python3.10/dist-packages (from openai<2.0.0,>=1.32.0->langchain-openai) (4.12.2)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain-community) (3.3.2)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain-community) (3.7)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain-community) (2.0.7)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain-community) (2024.6.2)\n",
      "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy<3,>=1.4->langchain-community) (3.0.3)\n",
      "Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.10/dist-packages (from tiktoken<1,>=0.7->langchain-openai) (2024.5.15)\n",
      "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai<2.0.0,>=1.32.0->langchain-openai) (1.2.1)\n",
      "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->openai<2.0.0,>=1.32.0->langchain-openai) (1.0.5)\n",
      "Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.10/dist-packages (from httpcore==1.*->httpx<1,>=0.23.0->openai<2.0.0,>=1.32.0->langchain-openai) (0.14.0)\n",
      "Requirement already satisfied: jsonpointer>=1.9 in /usr/local/lib/python3.10/dist-packages (from jsonpatch<2.0,>=1.33->langchain-core<0.3,>=0.2.2->langchain-openai) (3.0.0)\n",
      "Requirement already satisfied: annotated-types>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1->langchain<0.3.0,>=0.2.6->langchain-community) (0.7.0)\n",
      "Requirement already satisfied: pydantic-core==2.20.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1->langchain<0.3.0,>=0.2.6->langchain-community) (2.20.0)\n",
      "Requirement already satisfied: mypy-extensions>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community) (1.0.0)\n",
      "Installing collected packages: tiktoken, langchain-openai\n",
      "Successfully installed langchain-openai-0.1.14 tiktoken-0.7.0\n",
      "Collecting gradio\n",
      "  Downloading gradio-4.37.2-py3-none-any.whl (12.3 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.3/12.3 MB\u001b[0m \u001b[31m73.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)\n",
      "  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)\n",
      "Requirement already satisfied: altair<6.0,>=4.2.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (4.2.2)\n",
      "Collecting fastapi (from gradio)\n",
      "  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m92.0/92.0 kB\u001b[0m \u001b[31m10.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting ffmpy (from gradio)\n",
      "  Downloading ffmpy-0.3.2.tar.gz (5.5 kB)\n",
      "  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
      "Collecting gradio-client==1.0.2 (from gradio)\n",
      "  Downloading gradio_client-1.0.2-py3-none-any.whl (318 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m318.2/318.2 kB\u001b[0m \u001b[31m28.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: httpx>=0.24.1 in /usr/local/lib/python3.10/dist-packages (from gradio) (0.27.0)\n",
      "Requirement already satisfied: huggingface-hub>=0.19.3 in /usr/local/lib/python3.10/dist-packages (from gradio) (0.23.4)\n",
      "Requirement already satisfied: importlib-resources<7.0,>=1.3 in /usr/local/lib/python3.10/dist-packages (from gradio) (6.4.0)\n",
      "Requirement already satisfied: jinja2<4.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (3.1.4)\n",
      "Requirement already satisfied: markupsafe~=2.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (2.1.5)\n",
      "Requirement already satisfied: matplotlib~=3.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (3.7.1)\n",
      "Requirement already satisfied: numpy<3.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (1.25.2)\n",
      "Requirement already satisfied: orjson~=3.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (3.10.6)\n",
      "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from gradio) (24.1)\n",
      "Requirement already satisfied: pandas<3.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (2.0.3)\n",
      "Requirement already satisfied: pillow<11.0,>=8.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (9.4.0)\n",
      "Requirement already satisfied: pydantic>=2.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (2.8.0)\n",
      "Collecting pydub (from gradio)\n",
      "  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)\n",
      "Collecting python-multipart>=0.0.9 (from gradio)\n",
      "  Downloading python_multipart-0.0.9-py3-none-any.whl (22 kB)\n",
      "Requirement already satisfied: pyyaml<7.0,>=5.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (6.0.1)\n",
      "Collecting ruff>=0.2.2 (from gradio)\n",
      "  Downloading ruff-0.5.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.1 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m10.1/10.1 MB\u001b[0m \u001b[31m90.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting semantic-version~=2.0 (from gradio)\n",
      "  Downloading semantic_version-2.10.0-py2.py3-none-any.whl (15 kB)\n",
      "Collecting tomlkit==0.12.0 (from gradio)\n",
      "  Downloading tomlkit-0.12.0-py3-none-any.whl (37 kB)\n",
      "Requirement already satisfied: typer<1.0,>=0.12 in /usr/local/lib/python3.10/dist-packages (from gradio) (0.12.3)\n",
      "Requirement already satisfied: typing-extensions~=4.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (4.12.2)\n",
      "Requirement already satisfied: urllib3~=2.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (2.0.7)\n",
      "Collecting uvicorn>=0.14.0 (from gradio)\n",
      "  Downloading uvicorn-0.30.1-py3-none-any.whl (62 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.4/62.4 kB\u001b[0m \u001b[31m7.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from gradio-client==1.0.2->gradio) (2023.6.0)\n",
      "Collecting websockets<12.0,>=10.0 (from gradio-client==1.0.2->gradio)\n",
      "  Downloading websockets-11.0.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (129 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m129.9/129.9 kB\u001b[0m \u001b[31m14.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: entrypoints in /usr/local/lib/python3.10/dist-packages (from altair<6.0,>=4.2.0->gradio) (0.4)\n",
      "Requirement already satisfied: jsonschema>=3.0 in /usr/local/lib/python3.10/dist-packages (from altair<6.0,>=4.2.0->gradio) (4.19.2)\n",
      "Requirement already satisfied: toolz in /usr/local/lib/python3.10/dist-packages (from altair<6.0,>=4.2.0->gradio) (0.12.1)\n",
      "Requirement already satisfied: anyio in /usr/local/lib/python3.10/dist-packages (from httpx>=0.24.1->gradio) (3.7.1)\n",
      "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from httpx>=0.24.1->gradio) (2024.6.2)\n",
      "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.10/dist-packages (from httpx>=0.24.1->gradio) (1.0.5)\n",
      "Requirement already satisfied: idna in /usr/local/lib/python3.10/dist-packages (from httpx>=0.24.1->gradio) (3.7)\n",
      "Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from httpx>=0.24.1->gradio) (1.3.1)\n",
      "Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.10/dist-packages (from httpcore==1.*->httpx>=0.24.1->gradio) (0.14.0)\n",
      "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.19.3->gradio) (3.15.4)\n",
      "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.19.3->gradio) (2.31.0)\n",
      "Requirement already satisfied: tqdm>=4.42.1 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.19.3->gradio) (4.66.4)\n",
      "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib~=3.0->gradio) (1.2.1)\n",
      "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib~=3.0->gradio) (0.12.1)\n",
      "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib~=3.0->gradio) (4.53.0)\n",
      "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib~=3.0->gradio) (1.4.5)\n",
      "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib~=3.0->gradio) (3.1.2)\n",
      "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib~=3.0->gradio) (2.8.2)\n",
      "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas<3.0,>=1.0->gradio) (2023.4)\n",
      "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas<3.0,>=1.0->gradio) (2024.1)\n",
      "Requirement already satisfied: annotated-types>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from pydantic>=2.0->gradio) (0.7.0)\n",
      "Requirement already satisfied: pydantic-core==2.20.0 in /usr/local/lib/python3.10/dist-packages (from pydantic>=2.0->gradio) (2.20.0)\n",
      "Requirement already satisfied: click>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from typer<1.0,>=0.12->gradio) (8.1.7)\n",
      "Requirement already satisfied: shellingham>=1.3.0 in /usr/local/lib/python3.10/dist-packages (from typer<1.0,>=0.12->gradio) (1.5.4)\n",
      "Requirement already satisfied: rich>=10.11.0 in /usr/local/lib/python3.10/dist-packages (from typer<1.0,>=0.12->gradio) (13.7.1)\n",
      "Collecting starlette<0.38.0,>=0.37.2 (from fastapi->gradio)\n",
      "  Downloading starlette-0.37.2-py3-none-any.whl (71 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m71.9/71.9 kB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting fastapi-cli>=0.0.2 (from fastapi->gradio)\n",
      "  Downloading fastapi_cli-0.0.4-py3-none-any.whl (9.5 kB)\n",
      "Collecting ujson!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0,>=4.0.1 (from fastapi->gradio)\n",
      "  Downloading ujson-5.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.6/53.6 kB\u001b[0m \u001b[31m7.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting email_validator>=2.0.0 (from fastapi->gradio)\n",
      "  Downloading email_validator-2.2.0-py3-none-any.whl (33 kB)\n",
      "Collecting dnspython>=2.0.0 (from email_validator>=2.0.0->fastapi->gradio)\n",
      "  Downloading dnspython-2.6.1-py3-none-any.whl (307 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m307.7/307.7 kB\u001b[0m \u001b[31m32.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio) (23.2.0)\n",
      "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio) (2023.12.1)\n",
      "Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio) (0.35.1)\n",
      "Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio) (0.18.1)\n",
      "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib~=3.0->gradio) (1.16.0)\n",
      "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich>=10.11.0->typer<1.0,>=0.12->gradio) (3.0.0)\n",
      "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich>=10.11.0->typer<1.0,>=0.12->gradio) (2.16.1)\n",
      "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio->httpx>=0.24.1->gradio) (1.2.1)\n",
      "Collecting httptools>=0.5.0 (from uvicorn>=0.14.0->gradio)\n",
      "  Downloading httptools-0.6.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (341 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m341.4/341.4 kB\u001b[0m \u001b[31m35.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting python-dotenv>=0.13 (from uvicorn>=0.14.0->gradio)\n",
      "  Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)\n",
      "Collecting uvloop!=0.15.0,!=0.15.1,>=0.14.0 (from uvicorn>=0.14.0->gradio)\n",
      "  Downloading uvloop-0.19.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.4/3.4 MB\u001b[0m \u001b[31m88.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting watchfiles>=0.13 (from uvicorn>=0.14.0->gradio)\n",
      "  Downloading watchfiles-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m60.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.19.3->gradio) (3.3.2)\n",
      "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer<1.0,>=0.12->gradio) (0.1.2)\n",
      "Building wheels for collected packages: ffmpy\n",
      "  Building wheel for ffmpy (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
      "  Created wheel for ffmpy: filename=ffmpy-0.3.2-py3-none-any.whl size=5584 sha256=de9a8f29aa541be61237e1471506c57d761a9594c2964e72ddcb3f93ebc9180f\n",
      "  Stored in directory: /root/.cache/pip/wheels/bd/65/9a/671fc6dcde07d4418df0c592f8df512b26d7a0029c2a23dd81\n",
      "Successfully built ffmpy\n",
      "Installing collected packages: pydub, ffmpy, websockets, uvloop, uvicorn, ujson, tomlkit, semantic-version, ruff, python-multipart, python-dotenv, httptools, dnspython, aiofiles, watchfiles, starlette, email_validator, gradio-client, fastapi-cli, fastapi, gradio\n",
      "Successfully installed aiofiles-23.2.1 dnspython-2.6.1 email_validator-2.2.0 fastapi-0.111.0 fastapi-cli-0.0.4 ffmpy-0.3.2 gradio-4.37.2 gradio-client-1.0.2 httptools-0.6.1 pydub-0.25.1 python-dotenv-1.0.1 python-multipart-0.0.9 ruff-0.5.1 semantic-version-2.10.0 starlette-0.37.2 tomlkit-0.12.0 ujson-5.10.0 uvicorn-0.30.1 uvloop-0.19.0 watchfiles-0.22.0 websockets-11.0.3\n",
      "Collecting tantivy\n",
      "  Downloading tantivy-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.5 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.5/4.5 MB\u001b[0m \u001b[31m42.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hInstalling collected packages: tantivy\n",
      "Successfully installed tantivy-0.22.0\n"
     ]
    }
   ],
   "source": [
    "!pip install langchain openai\n",
    "!pip install -qU langchain-text-splitters\n",
    "!pip install pypdf\n",
    "!pip install langchain_community\n",
    "!pip install pymupdf\n",
    "!pip install lancedb\n",
    "!pip install pymupdf4llm\n",
    "!pip install \"unstructured[md]\"\n",
    "!pip install -U langchain-openai langchain-community\n",
    "!pip install gradio\n",
    "!pip install tantivy"
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Download the data"
   ],
   "metadata": {
    "id": "JVzJDVA_HJ7x"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# Openai api key\n",
    "OPENAI_API_KEY = \"\""
   ],
   "metadata": {
    "id": "OJzDn_XzRJ1G"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "!wget https://github.com/akashAD98/dummy_data/raw/main/sample_credit_card.pdf"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "a21ep2UvBFyv",
    "outputId": "f8ad5b70-0fd4-4f44-d9e7-fb24e491d9ff"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "--2024-07-07 16:53:04--  https://github.com/akashAD98/dummy_data/raw/main/sample_credit_card.pdf\n",
      "Resolving github.com (github.com)... 140.82.112.4\n",
      "Connecting to github.com (github.com)|140.82.112.4|:443... connected.\n",
      "HTTP request sent, awaiting response... 302 Found\n",
      "Location: https://raw.githubusercontent.com/akashAD98/dummy_data/main/sample_credit_card.pdf [following]\n",
      "--2024-07-07 16:53:05--  https://raw.githubusercontent.com/akashAD98/dummy_data/main/sample_credit_card.pdf\n",
      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 941934 (920K) [application/octet-stream]\n",
      "Saving to: ‘sample_credit_card.pdf’\n",
      "\n",
      "sample_credit_card. 100%[===================>] 919.86K  --.-KB/s    in 0.06s   \n",
      "\n",
      "2024-07-07 16:53:05 (15.3 MB/s) - ‘sample_credit_card.pdf’ saved [941934/941934]\n",
      "\n"
     ]
    }
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "Import Packages"
   ],
   "metadata": {
    "id": "xn4tU3PFBGJq"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "import os\n",
    "import shutil\n",
    "from datetime import datetime\n",
    "from tempfile import mkdtemp\n",
    "from typing import List\n",
    "\n",
    "import gradio as gr\n",
    "import pymupdf4llm\n",
    "from pydantic import BaseModel, Field, validator\n",
    "\n",
    "from langchain.chains.question_answering import load_qa_chain\n",
    "\n",
    "# from langchain.community.document_loaders import UnstructuredMarkdownLoader\n",
    "from langchain.document_loaders import UnstructuredMarkdownLoader\n",
    "from langchain_core.documents import Document\n",
    "from langchain_core.prompts import PromptTemplate\n",
    "from langchain.memory import ConversationBufferMemory\n",
    "from langchain.llms import OpenAI\n",
    "from langchain_openai import ChatOpenAI\n",
    "from langchain.embeddings import OpenAIEmbeddings\n",
    "\n",
    "\n",
    "from langchain_community.vectorstores import LanceDB\n",
    "from lancedb.rerankers import LinearCombinationReranker\n",
    "\n",
    "\n",
    "from langchain.output_parsers import PydanticOutputParser\n",
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "from lancedb.rerankers import LinearCombinationReranker"
   ],
   "metadata": {
    "id": "SFfLrcqBC3Vf"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "We are loading a PDF file and converting it to Markdown format, as LLMs (Language Learning Models) excel at understanding Markdown language. Due to the complexity of our PDF, we are employing this method. We are using the `pymupdf4llm` library, which is specifically designed for this purpose. For more information, you can refer to the documentation [here](https://pymupdf4llm.readthedocs.io/en/latest/)."
   ],
   "metadata": {
    "id": "TFloIO0TDRO5"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# Convert PDF to Markdown\n",
    "md_text = pymupdf4llm.to_markdown(\n",
    "    \"/content/sample_credit_card.pdf\", table_strategy=\"lines_strict\"\n",
    ")\n",
    "\n",
    "# Write Markdown string to a file\n",
    "output = open(\"output_credit_card.md\", \"w\")\n",
    "output.write(md_text)\n",
    "output.close()\n",
    "\n",
    "print(\"Converted to .md\")\n",
    "\n",
    "# Load the Markdown file\n",
    "loader = UnstructuredMarkdownLoader(\"/content/output_credit_card.md\")\n",
    "documents = loader.load()\n",
    "\n",
    "# Split documents into chunks\n",
    "text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n",
    "chunks = text_splitter.split_documents(documents)"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "CTv0daiaDReR",
    "outputId": "28bee5b8-03e2-4976-f8dc-5bd8fc671e75"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Converted to .md\n"
     ]
    },
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
      "[nltk_data]   Unzipping tokenizers/punkt.zip.\n",
      "[nltk_data] Downloading package averaged_perceptron_tagger to\n",
      "[nltk_data]     /root/nltk_data...\n",
      "[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "boY5NrG3GPWN"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Bank Statement Analysis Prompt\n"
   ],
   "metadata": {
    "id": "-sBJF4svGPl7"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# Define the prompt template\n",
    "template = \"\"\"\n",
    "Your ability to extract and summarize this information accurately is essential for effective credit card statement analysis.\n",
    "You are a financial expert AI chatbot having a conversation with a human.\n",
    "Your task is to provide accurate and helpful answers based on the extracted parts of a credit card statement.\n",
    "Pay close attention to the credit card statement's language, structure, and any cross-references to ensure comprehensive and precise extraction of information.\n",
    "Do not use prior knowledge or information from outside the context to answer the questions.\n",
    "Only use the information provided in the context to answer the questions.\n",
    "\n",
    "Credit Card Statement Extract:\n",
    "{context}\n",
    "\n",
    "Conversation History:\n",
    "{chat_history}\n",
    "\n",
    "Human: {human_input}\n",
    "Chatbot:\n",
    "\"\"\"\n",
    "\n",
    "prompt = PromptTemplate(\n",
    "    input_variables=[\"chat_history\", \"human_input\", \"context\"],\n",
    "    template=template,\n",
    ")"
   ],
   "metadata": {
    "id": "jjw4c4VgGX4k"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "\n",
    "\n",
    "# Set up conversation memory - using langchain memory\n",
    "memory = ConversationBufferMemory(memory_key=\"chat_history\", input_key=\"human_input\")\n",
    "\n",
    "\n",
    "# Initialize OpenAI embeddings\n",
    "embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)\n",
    "\n",
    "\n",
    "# Load the QA chain\n",
    "chain = load_qa_chain(\n",
    "    OpenAI(temperature=0, openai_api_key= OPENAI_API_KEY)\n",
    "    chain_type=\"stuff\",\n",
    "    memory=memory,\n",
    "    prompt=prompt\n",
    ")\n",
    "\n",
    "# Define the query\n",
    "query = \"what is the minimum payment due\"\n",
    "\n",
    "# Initialize the re-ranker model\n",
    "reranker = LinearCombinationReranker(weight=0.3)\n",
    "\n",
    "# Perform similarity search with LanceDB\n",
    "docsearch = LanceDB.from_documents(chunks, embeddings, reranker=reranker)\n",
    "\n",
    "\n",
    "docs = docsearch.similarity_search(query)\n",
    "docs_score = docsearch.similarity_search_with_relevance_scores(query)\n",
    "print(\"Relevance score - \", docs_score[0][1])\n",
    "print(\"Text - \", docs_score[0][0].page_content[:1000])\n"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "xRBkp6-SAkuN",
    "outputId": "f95e59aa-05d1-477c-8191-edae010a7dac"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "/usr/local/lib/python3.10/dist-packages/langchain_core/_api/deprecation.py:139: LangChainDeprecationWarning: The class `OpenAIEmbeddings` was deprecated in LangChain 0.0.9 and will be removed in 0.3.0. An updated version of the class exists in the langchain-openai package and should be used instead. To use it run `pip install -U langchain-openai` and import as `from langchain_openai import OpenAIEmbeddings`.\n",
      "  warn_deprecated(\n",
      "/usr/local/lib/python3.10/dist-packages/langchain_core/_api/deprecation.py:139: LangChainDeprecationWarning: The class `OpenAI` was deprecated in LangChain 0.0.10 and will be removed in 0.3.0. An updated version of the class exists in the langchain-openai package and should be used instead. To use it run `pip install -U langchain-openai` and import as `from langchain_openai import OpenAI`.\n",
      "  warn_deprecated(\n"
     ]
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Relevance score -  0.7782929097294156\n",
      "Text -  minimum payment each period, you will pay more in interest and it will take you longer\n",
      "to pay off your Non-Plan Balance. For example:\n",
      "\n",
      "New Balance $10,269.65\n",
      "Minimum Payment Due $205.39\n",
      "\n",
      "Credit Limit $26,400.00\n",
      "Available Credit $16,130.35\n",
      "\n",
      "Cash Advance Limit $4,600.00\n",
      "Available Cash $4,600.00\n",
      "\n",
      "If you make no additional\n",
      "charges and each month\n",
      "you pay...\n",
      "\n",
      "You will pay off the balance\n",
      "shown on this statement in\n",
      "about...\n",
      "\n",
      "And you will pay an\n",
      "estimated total of...\n",
      "\n",
      "Only the\n",
      "22 years $29,830\n",
      "Minimum Payment Due\n",
      "\n",
      "$14,640\n",
      "$407 3 years (Savings = $15,190)\n",
      "\n",
      "If you would like information about credit counseling services, call 1-888-733-4139.\n",
      "\n",
      "See page 2 for important information about your account.\n",
      "\n",
      "Please refer to the IMPORTANT NOTICES section on\n",
      "\n",
      "page 7.\n",
      "\n",
      "Continued on page 3\n",
      "\n",
      "Please fold on the perforation below, detach and return with your payment\n",
      "\n",
      "Payment Coupon Pay by Computer Pay by Phone Account Ending 7-73045\n"
     ]
    }
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "# some example test"
   ],
   "metadata": {
    "id": "NhNfgTAbBAyL"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "op_statment = chain(\n",
    "    {\"input_documents\": docs, \"human_input\": query}, return_only_outputs=True\n",
    ")\n",
    "\n",
    "op_statment"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "fN5zr-fPA4kk",
    "outputId": "822335ce-620d-4bc8-c8cb-c245ec082532"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "/usr/local/lib/python3.10/dist-packages/langchain_core/_api/deprecation.py:139: LangChainDeprecationWarning: The method `Chain.__call__` was deprecated in langchain 0.1.0 and will be removed in 0.3.0. Use invoke instead.\n",
      "  warn_deprecated(\n"
     ]
    },
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "{'output_text': 'The minimum payment due on this credit card statement is $205.39. If you only make the minimum payment each period, you will end up paying more in interest and it will take you longer to pay off your Non-Plan Balance. For example, with a New Balance of $10,269.65, it would take approximately 22 years to pay off and you would end up paying an estimated total of $29,830.'}"
      ]
     },
     "metadata": {},
     "execution_count": 6
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "6NHXSKIWA4p0"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "query = \"What is the available credit for this credit card \"\n",
    "\n",
    "op_statment = chain(\n",
    "    {\"input_documents\": docs, \"human_input\": query}, return_only_outputs=True\n",
    ")\n",
    "\n",
    "from IPython.display import Markdown, display\n",
    "\n",
    "display(Markdown(op_statment[\"output_text\"]))"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 64
    },
    "id": "wRgPRVK9A4t_",
    "outputId": "d7d6a787-bd84-4f64-ca37-5222b796c203"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ],
      "text/markdown": "The available credit for this credit card is $16,130.35. This is the amount of credit that you have left to use on your card, after taking into account your current balance and any pending charges. It is important to keep track of your available credit to avoid going over your credit limit and incurring fees."
     },
     "metadata": {}
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "query = \"List all the fees and interest charged in this statement period \"\n",
    "\n",
    "\n",
    "op_statment = chain(\n",
    "    {\"input_documents\": docs, \"human_input\": query}, return_only_outputs=True\n",
    ")\n",
    "\n",
    "from IPython.display import Markdown, display\n",
    "\n",
    "display(Markdown(op_statment[\"output_text\"]))"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 64
    },
    "id": "qrSx0z43A6Ep",
    "outputId": "7669323b-5504-44e2-abab-cd43103a7a95"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ],
      "text/markdown": "The fees and interest charged in this statement period are $0.00. This includes any fees for late payments or cash advances, as well as any interest charged on your balance. It is important to pay your balance in full each month to avoid these fees and interest charges."
     },
     "metadata": {}
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "query = \"What is the new balance for the current statement? \"\n",
    "\n",
    "\n",
    "op_statment = chain(\n",
    "    {\"input_documents\": docs, \"human_input\": query}, return_only_outputs=True\n",
    ")\n",
    "\n",
    "from IPython.display import Markdown, display\n",
    "\n",
    "display(Markdown(op_statment[\"output_text\"]))"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 64
    },
    "id": "ZF5m3plaA7kH",
    "outputId": "7b5ec8e4-3d91-4794-9a86-294f3b1cc154"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ],
      "text/markdown": "The new balance for the current statement is $10,269.65. This includes any new charges made during the statement period, as well as any previous balance and payments/credits. It is important to pay off your balance in full each month to avoid carrying a balance and incurring interest charges."
     },
     "metadata": {}
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "query = \"What is the payment due date?\"\n",
    "\n",
    "op_statment = chain(\n",
    "    {\"input_documents\": docs, \"human_input\": query}, return_only_outputs=True\n",
    ")\n",
    "\n",
    "from IPython.display import Markdown, display\n",
    "\n",
    "display(Markdown(op_statment[\"output_text\"]))"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 64
    },
    "id": "6-WG-6GJAmAm",
    "outputId": "37515f4a-5735-4776-ebcd-5b581541f1e3"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "text/plain": [
       "<IPython.core.display.Markdown object>"
      ],
      "text/markdown": "The payment due date for this credit card statement is 10/22/23. It is important to make your payment by this date to avoid late fees and potential increases in your APR. You can make your payment online, by phone, or by mail using the payment coupon provided on the statement."
     },
     "metadata": {}
    }
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "Gradio app with file upload support"
   ],
   "metadata": {
    "id": "HwIucTYfBMHQ"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "import os\n",
    "import pymupdf4llm\n",
    "import gradio as gr\n",
    "from langchain.llms import OpenAI\n",
    "from tempfile import mkdtemp\n",
    "from langchain.chains.question_answering import load_qa_chain\n",
    "from langchain.memory import ConversationBufferMemory\n",
    "from langchain.prompts import PromptTemplate\n",
    "from langchain_community.vectorstores import LanceDB\n",
    "from lancedb.rerankers import LinearCombinationReranker\n",
    "from langchain.embeddings import OpenAIEmbeddings\n",
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "from langchain.document_loaders import UnstructuredMarkdownLoader\n",
    "\n",
    "\n",
    "OPENAI_API_KEY = 'your_api_key\"'\n",
    "# Temporary directory for file processing\n",
    "temp_dir = mkdtemp()\n",
    "\n",
    "# Define global variables for embeddings and docsearch\n",
    "embeddings = None\n",
    "docsearch = None\n",
    "\n",
    "\n",
    "# Function to process the uploaded PDF file\n",
    "def process_file(file):\n",
    "    global embeddings, docsearch\n",
    "\n",
    "    if file is None:\n",
    "        return \"No file uploaded. Please upload a PDF file.\"\n",
    "\n",
    "    # Convert PDF to markdown\n",
    "    md_text = pymupdf4llm.to_markdown(file.name, table_strategy=\"lines_strict\")\n",
    "\n",
    "    # Write markdown string to a file in the temporary directory\n",
    "    md_file_path = os.path.join(temp_dir, \"output_credit_card.md\")\n",
    "    with open(md_file_path, \"w\") as output:\n",
    "        output.write(md_text)\n",
    "\n",
    "    # Load the markdown file\n",
    "    loader = UnstructuredMarkdownLoader(md_file_path)\n",
    "    documents = loader.load()\n",
    "\n",
    "    # Split the document into chunks\n",
    "    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n",
    "    chunks = text_splitter.split_documents(documents)\n",
    "\n",
    "    # Setup embeddings and vector store\n",
    "    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)\n",
    "    reranker = LinearCombinationReranker(weight=0.3)\n",
    "    docsearch = LanceDB.from_documents(chunks, embeddings, reranker=reranker)\n",
    "\n",
    "    return \"File processed successfully. You can now ask questions about the credit card statement.\"\n",
    "\n",
    "\n",
    "# Function to process the query\n",
    "def process_query(query):\n",
    "    if docsearch is None:\n",
    "        return \"No document has been uploaded and processed yet. Please upload a PDF first.\"\n",
    "\n",
    "    docs = docsearch.similarity_search(query)\n",
    "    top_docs = docs[:2]  # Get only the top 2 documents\n",
    "    response = chain(\n",
    "        {\"input_documents\": top_docs, \"human_input\": query}, return_only_outputs=True\n",
    "    )\n",
    "    return response[\"output_text\"]\n",
    "\n",
    "\n",
    "# Define the prompt template\n",
    "template = \"\"\"\n",
    "Your ability to extract and summarize this information accurately is essential for effective credit card statement analysis.\n",
    "You are a financial expert AI chatbot having a conversation with a human.\n",
    "Your task is to provide accurate and helpful answers based on the extracted parts of a credit card statement.\n",
    "Pay close attention to the credit card statement's language, structure, and any cross-references to ensure comprehensive and precise extraction of information.\n",
    "Do not use prior knowledge or information from outside the context to answer the questions.\n",
    "\n",
    "If the human greets you then respond with a polite greeting.\n",
    "If the question is not related to the credit card statement, respond with \"Sorry, I don't know. Please ask questions related to the provided credit card statement.\"\n",
    "\n",
    "Only use the information provided in the context to answer the questions.\n",
    "\n",
    "Credit Card Statement Extract:\n",
    "{context}\n",
    "\n",
    "Conversation History:\n",
    "{chat_history}\n",
    "\n",
    "Human: {human_input}\n",
    "Chatbot:\n",
    "\"\"\"\n",
    "\n",
    "prompt = PromptTemplate(\n",
    "    input_variables=[\"chat_history\", \"human_input\", \"context\"], template=template\n",
    ")\n",
    "\n",
    "# Set up the memory and chain\n",
    "memory = ConversationBufferMemory(memory_key=\"chat_history\", input_key=\"human_input\")\n",
    "chain = load_qa_chain(\n",
    "    OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY),\n",
    "    chain_type=\"stuff\",\n",
    "    memory=memory,\n",
    "    prompt=prompt,\n",
    ")\n",
    "\n",
    "# Define the Gradio interfaces\n",
    "upload_interface = gr.Interface(\n",
    "    fn=process_file,\n",
    "    inputs=gr.File(label=\"Upload your credit card statement PDF\"),\n",
    "    outputs=\"text\",\n",
    "    title=\"Credit Card Statement Upload\",\n",
    "    description=\"Upload your credit card statement PDF file to analyze.\",\n",
    ")\n",
    "\n",
    "chat_interface = gr.Interface(\n",
    "    fn=process_query,\n",
    "    inputs=gr.Textbox(lines=2, placeholder=\"Enter your query here...\"),\n",
    "    outputs=gr.Textbox(),\n",
    "    title=\"Credit Card Statement Analysis\",\n",
    "    description=\"Ask questions about the credit card statement.\",\n",
    ")\n",
    "\n",
    "iface = gr.TabbedInterface(\n",
    "    interface_list=[upload_interface, chat_interface],\n",
    "    tab_names=[\"Upload PDF\", \"Chat with AI\"],\n",
    ")\n",
    "\n",
    "# Launch the interface\n",
    "iface.launch(share=True, debug=True)\n",
    "\n",
    "# Clean up the temporary directory on exit\n",
    "import atexit\n",
    "\n",
    "\n",
    "def cleanup_temp_dir():\n",
    "    shutil.rmtree(temp_dir)\n",
    "\n",
    "\n",
    "atexit.register(cleanup_temp_dir)"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 591
    },
    "id": "wPOEIZSXAmD-",
    "outputId": "bd60bf63-ed62-4fe0-9e4e-2aaf6a04d8ac"
   },
   "execution_count": null,
   "outputs": [
    {
     "metadata": {
      "tags": null
     },
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().\n",
      "Running on public URL: https://5d478a3efe1cef1d7b.gradio.live\n",
      "\n",
      "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div><iframe src=\"https://5d478a3efe1cef1d7b.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "Wl91vC9_AmGh"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "mVoy5qdcAmI1"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "0b3yNrRzAmLu"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "8l_z2mUhFrYS"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "uPbd2gtuFra1"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "oiHdJDAG3K4s"
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}